In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statistics
import seaborn as sns
def print_full_dataframe(df):
with pd.option_context('display.max_columns', None, 'display.expand_frame_repr', False):
print(df)
df = pd.read_csv('master.csv')
df2 = pd.read_csv('share-with-anxiety-disorders.csv')
df3 = pd.read_csv('Countries_GDP_1960-2020.csv')
df4 = pd.read_csv('DP_LIVE_13042023005821788.csv', encoding='latin1')
print("df.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df.isnull().sum())
print()
print()
print()
print()
print("df2.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df2.isnull().sum())
print()
print()
print()
print()
print("df3.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
null_values_df = df3.isnull().sum().to_frame().T
print_full_dataframe(null_values_df)
print()
print()
print()
print()
print("df4.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df4.isnull().sum())
print()
print()
print()
print()
print("df")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df)
print()
print()
print()
print()
print("df2")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df2)
print()
print()
print()
print()
print("df3")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df3)
print()
print()
print()
print()
print("df4")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df4)
print()
print()
print()
print()
#Start filtering the Dataframes
df['HDI for year'].fillna(df['HDI for year'].mean(), inplace=True)# convert the ' gdp_for_year ($) ' column to a numeric type
df[' gdp_for_year ($) '] = df[' gdp_for_year ($) '].str.replace(',', '').astype(float)
filtered_df = df[(df['year'] >= 2000) & (df['year'] <= 2015)]
df2_filtered = df2[(df2['Year'] >= 2000) & (df2['Year'] <= 2015)]
df2_filtered = df2_filtered.drop('Code', axis=1)
years_to_keep = [str(year) for year in range(2000, 2016)] # List of strings containing years from 2000 to 2015
columns_to_keep = ['Country Name', 'Country Code'] + years_to_keep
df3_filtered = df3[columns_to_keep]
df4_filtered = df4[(df4['TIME'] >= 2000) & (df4['TIME'] <= 2015)]
df4_filtered = df4_filtered.rename(columns={"\"LOCATION\"": "LOCATION"})
df4_filtered = df4_filtered.drop('Flag Codes', axis=1)
df4_filtered = df4_filtered.drop('SUBJECT',axis=1)
df4_filtered = df4_filtered.drop('MEASURE',axis=1)
df4_filtered = df4_filtered.drop('FREQUENCY',axis=1)
df4_filtered = df4_filtered.drop('INDICATOR',axis=1)
print("filtered_df.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(filtered_df.isnull().sum())
filtered_df.to_csv('filtered_df.csv', index=False)
print()
print()
print()
print()
print("df2_filtered.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df2_filtered.isnull().sum())
df2_filtered.to_csv('df2_filtered.csv', index=False)
print()
print()
print()
print()
print("df3_filtered.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
null_values_df = df3_filtered.isnull().sum().to_frame().T
print_full_dataframe(null_values_df)
df3_filtered.to_csv('df3_filtered.csv', index=False)
print()
print()
print()
print()
print("df4_filtered.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df4_filtered.isnull().sum())
df4_filtered.to_csv('df4_filtered.csv', index=False)
print()
print()
print()
print()
print("filtered_df")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(filtered_df)
print()
print()
print()
print()
print("df2_filtered")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df2_filtered)
print()
print()
print()
print()
print("df3_filtered")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df3_filtered)
print()
print()
print()
print()
print("df4_filtered")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df4_filtered)
print()
print()
print()
print()
average_suicide_rates_by_country = filtered_df.groupby('country')['suicides/100k pop'].mean().sort_values(ascending=False)
print("average_suicide_rates_by_country = filtered_df.groupby('country')['suicides/100k pop'].mean().sort_values(ascending=False)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_country)
print()
print()
print()
print()
average_suicide_rates_by_country_and_year = filtered_df.groupby(['year','country'])['suicides/100k pop'].mean()
#average_suicide_rates_by_country_and_year.to_csv('average_suicide_rates_by_country_and_year.csv', index=True)
print("average_suicide_rates_by_country_and_year = filtered_df.groupby(['year','country'])['suicides/100k pop'].mean().sort_values(ascending=False)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_country_and_year)
print()
print()
print()
print()
average_suicide_rates_by_gender = filtered_df.groupby('sex')['suicides/100k pop'].mean()
print("average_suicide_rates_by_gender = filtered_df.groupby('sex')['suicides/100k pop'].mean()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_gender)
print()
print()
print()
print()
average_suicide_rates_by_gender_and_year = filtered_df.groupby(['year','sex'])['suicides/100k pop'].mean()
#average_suicide_rates_by_gender_and_year.to_csv('average_suicide_rates_by_gender_and_year.csv', index=True)
print("average_suicide_rates_by_gender_and_year = filtered_df.groupby(['year','sex'])['suicides/100k pop'].mean()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_gender_and_year)
print()
print()
print()
print()
average_suicide_rates_by_age = filtered_df.groupby('age')['suicides/100k pop'].mean().sort_values(ascending=False)
print("average_suicide_rates_by_age = filtered_df.groupby('age')['suicides/100k pop'].mean().sort_values(ascending=False)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_age)
print()
print()
print()
print()
average_suicide_rates_by_age_and_year = filtered_df.groupby(['year','age'])['suicides/100k pop'].mean()
#average_suicide_rates_by_age_and_year.to_csv('average_suicide_rates_by_age_and_year.csv', index=True)
print("average_suicide_rates_by_age_and_year = filtered_df.groupby(['year','age'])['suicides/100k pop'].mean().sort_values(ascending=False)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_age_and_year)
print()
print()
print()
print()
#calculate the correlation coefficients
correlations = filtered_df[['suicides/100k pop', 'gdp_per_capita ($)', ' gdp_for_year ($) ']].corr()
print("correlations = filtered_df[['suicides/100k pop', 'gdp_per_capita ($)', ' gdp_for_year ($) ']].corr()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(correlations)
print()
print()
print()
print()
average_suicide_rates_by_generation = filtered_df.groupby('generation')['suicides/100k pop'].mean().sort_values(ascending=False)
print("average_suicide_rates_by_generation = filtered_df.groupby('generation')['suicides/100k pop'].mean().sort_values(ascending=False)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_generation)
print()
print()
print()
print()
average_suicide_rates_by_generation_and_year = filtered_df.groupby(['year', 'generation'])['suicides/100k pop'].mean()
#average_suicide_rates_by_generation_and_year.to_csv('average_suicide_rates_by_generation_and_year.csv', index=True)
print("average_suicide_rates_by_generation_and_year = filtered_df.groupby(['year', 'generation'])['suicides/100k pop'].mean().sort_values(ascending=False)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_generation_and_year)
print()
print()
print()
print()
average_suicide_rates_by_year = filtered_df.groupby('year')['suicides/100k pop'].mean()
print("average_suicide_rates_by_year = filtered_df.groupby('year')['suicides/100k pop'].mean()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_suicide_rates_by_year)
print()
print()
print()
print()
prevalence_by_country = df2_filtered.groupby(['Entity', 'Year'])['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
print("prevalence_by_country = df2_filtered.groupby(['Entity', 'Year'])['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(prevalence_by_country)
print()
print()
print()
print()
average_prevalence_by_country = df2_filtered.groupby('Entity')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
print("average_prevalence_by_country = df2_filtered.groupby('Entity')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_prevalence_by_country)
print()
print()
print()
print()
mean_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
print("mean_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(mean_prevalence)
print()
print()
print()
print()
median_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].median()
print("median_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].median()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(median_prevalence)
print()
print()
print()
print()
total_gdp = df3_filtered.iloc[:, 2:].sum(axis=0)
print("total_gdp = df3_filtered.iloc[:, 2:].sum(axis=0)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(total_gdp)
print()
print()
print()
print()
average_gdp = df3_filtered.iloc[:, 2:].mean(axis=0)
print("average_gdp = df3_filtered.iloc[:, 2:].mean(axis=0)")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(average_gdp)
print()
print()
print()
print()
mean_unemployment = df4_filtered.groupby('TIME')['Value'].mean()
print("mean_unemployment = df4_filtered.groupby('TIME')['Value'].mean()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(mean_unemployment)
print()
print()
print()
print()
median_unemployment = df4_filtered.groupby('TIME')['Value'].median()
print("median_unemployment = df4_filtered.groupby('TIME')['Value'].median()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(median_unemployment)
print()
print()
print()
print()
# Rename columns in df2_filtered, df3_filtered, and df4_filtered to match the common columns in filtered_df
df2_filtered.rename(columns={'Entity': 'country', 'Year': 'year'}, inplace=True)
df3_filtered.rename(columns={'Country Name': 'country', 'Country Code': 'country_code'}, inplace=True)
df4_filtered.rename(columns={'LOCATION': 'country_code', 'TIME': 'year', 'Value': 'value'}, inplace=True)
filtered_df.to_csv('filtered_df.csv', index=False)
df2_filtered.to_csv('df2_filtered.csv', index=False)
df3_filtered.to_csv('df3_filtered.csv', index=False)
df4_filtered.to_csv('df4_filtered.csv', index=False)
# Merge filtered_df with df2_filtered
merged_df = pd.merge(filtered_df, df2_filtered, on=['country', 'year'], how='left')#first merge
merged_df.to_csv('merged_df.csv', index=False)
print("merged_df = filtered_df.merge(df2_filtered, on=['country', 'year'], how='left')")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(merged_df)
print()
print()
print()
print()
# Merge the resulting DataFrame with df3_filtered
# Note: df3_filtered has data in wide format, with years as columns. We need to convert it to long format before merging
df3_long = pd.melt(df3_filtered, id_vars=['country', 'country_code'], var_name='year', value_name='gdp')
# Convert 'year' column to integer type
df3_long['year'] = df3_long['year'].astype(int)
df3_long.to_csv('df3_long.csv', index=False)
merged_df = pd.merge(merged_df, df3_long, on=['country', 'year'], how='left')#second merge
merged_df.to_csv('merged_df2.csv', index=False)
print("merged_df = merged_df.merge(df3_long, on=['country', 'year'], how='left')")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(merged_df)
print()
print()
print()
print()
# Merge the resulting DataFrame with df4_filtered
merged_df = pd.merge(merged_df, df4_filtered, left_on=['country_code', 'year'], right_on=['country_code', 'year'], how='left')#last merge
merged_df.dropna(inplace=True)
merged_df = merged_df.drop('country-year',axis=1)
merged_df = merged_df.drop('HDI for year',axis=1)
merged_df = merged_df.drop('gdp',axis=1)
merged_df = merged_df.drop('country_code',axis=1)
merged_df.to_csv('merged_df3.csv', index=False)
print("merged_df = merged_df.merge(df4_filtered, on=['country', 'year'], how='left')")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(merged_df)
print()
print()
print()
print()
print("merged_df.isnull().sum()")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(merged_df.isnull().sum())
print()
print()
print()
print()
print("merged_df.describe(include='all')")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(merged_df.describe(include='all'))
print()
print()
print()
print()
grouped_data = merged_df.groupby(['country', 'year']).agg({
'suicides_no': 'sum',
'population': 'sum',
' gdp_for_year ($) ': 'first',
'gdp_per_capita ($)': 'first',
'Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)': 'first',
'value': 'first'
}).reset_index()
grouped_data = grouped_data.rename(columns={"Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)": "Anxiety Prevalence (%)"})
grouped_data = grouped_data.rename(columns={"value": "Unemployment Rate(%)"})
# Calculate the new 'suicides/100k pop' rate for each group
grouped_data['suicides/100k pop'] = (grouped_data['suicides_no'] / grouped_data['population']) * 100000
# Write the result to a new CSV file
grouped_data.to_csv('transformed_data.csv', index=False)
print("grouped_data = merged_df.groupby(['country', 'year']).agg")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(grouped_data)
print()
print()
print()
print()
print("grouped_data.describe(include='all')")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(grouped_data.describe(include='all'))
print()
print()
print()
print()
tempDef2 = df4_filtered[(df4_filtered['country_code'] == 'USA')]
npArray = tempDef2.to_numpy()
npArray1 = []
for x in npArray:
npArray1.append(x[2])
npArray1 = np.array(npArray1)
npArray1
Mean = np.mean(npArray1)
Median = np.median(npArray1)
Std = np.std(npArray1)
q1 = np.percentile(npArray1,25)
q3 = np.percentile(npArray1, 75)
IQR = q3 - q1
data = {}
data['Mean'] = Mean
data['Median'] = Median
data['Std'] = Std
data['IQR'] = IQR
Stats = list(data.keys())
values = list(data.values())
fig = plt.figure(figsize= (10,5))
plt.bar(Stats,values,color = 'blue',width = 0.4)
# - - - - - - - - - - - - - - - - - - - - - - - - - -
plt.title("Unemployment")
plt.ylabel('Value')
plt.xlabel('Statistic')
plt.show()
###############################################################################################
#check for NaN or null values
values = df3_filtered.isna()
count = values.sum()
usa_data = df3_filtered.loc[df3_filtered['country'] == 'United States', '2000':'2015']
usa_data = np.array(usa_data)
# for x in usa_data:
# print(x)
Mean = np.mean(usa_data)
Mean1 = np.round(Mean,2)
print(Mean1)
Median = np.median(usa_data)
Median1 = np.round(Median,2)
print(Median1)
std = np.std(usa_data)
std1 = np.round(std,2)
print(std1)
q1 = np.percentile(usa_data,25)
q3 = np.percentile(usa_data, 75)
IQR = q3 - q1
IQR1 = np.round(IQR,2)
print(IQR1)
data = {}
data['Mean'] = Mean1
data['Median'] = Median1
data['Std'] = std1
data['IQR'] = IQR1
Stats = list(data.keys())
values = list(data.values())
fig = plt.figure(figsize= (10,5))
plt.bar(Stats,values,color = 'blue',width = 0.4)
# - - - - - - - - - - - - - - - - - - - - - - - - - -
plt.title("GDP")
plt.ylabel('Value')
plt.xlabel('Statistic')
plt.show()
#####################################################################################################################
values = df2_filtered.isna()
count = values.sum()
usa_data = df2_filtered.loc[(df2_filtered['country'] == 'United States')]
#print(usa_data)
#---------------------------------------------------
anxiety_disorders = usa_data['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)']
stat = np.round(anxiety_disorders.describe(), 2)
median = np.median(stat)
print(stat)
print(median)
#------------------------------------------------------------
# Create a box plot to visualize the statistics
# Create a bar graph to visualize the statistics
fig, ax = plt.subplots()
ax.bar(stat.index, stat.values)
ax.set_title('Descriptive statistics for anxiety disorders(USA)')
ax.set_xlabel('Statistic')
ax.set_ylabel('Value')
plt.show()
#####################################################################################################################
#check for NaN or null values
values = filtered_df.isna()
count = values.sum()
#print(count)
#delete NaN columms
filtered_df = filtered_df.dropna(axis=1, how='any')
drop_column= [' gdp_for_year ($) ', 'gdp_per_capita ($)', 'country-year']
filtered_df = filtered_df.drop(drop_column, axis=1)
#print(df)
#-----------------------------------------------------
numbers_suicide = filtered_df['suicides_no']
stat = np.round(numbers_suicide.describe(), 2)
median = np.median(numbers_suicide)
print(stat)
print(median)
#------------------------------------------------------------
# Create a box plot to visualize the statistics
# Create a bar graph to visualize the statistics
fig, ax = plt.subplots()
ax.bar(stat.index, stat.values)
ax.set_title('Descriptive statistics for suicides_no')
ax.set_xlabel('Statistic')
ax.set_ylabel('Value')
plt.show()
###################################################################################################################
values = merged_df.isna()
count = values.sum()
Australia_data = merged_df.loc[(merged_df['country'] == 'Australia')]
#print(usa_data)
#---------------------------------------------------
anxiety_disorders = Australia_data['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)']
stat = np.round(anxiety_disorders.describe(), 2)
median = np.median(stat)
print(stat)
print(median)
#------------------------------------------------------------
# Create a box plot to visualize the statistics
# Create a bar graph to visualize the statistics
fig, ax = plt.subplots()
ax.bar(stat.index, stat.values)
ax.set_title('Descriptive statistics for anxiety disorders(Australia)')
ax.set_xlabel('Statistic')
ax.set_ylabel('Value')
plt.show()
# Create a figure with multiple subplots
fig, axs = plt.subplots(6, 2, figsize=(20, 30))
# Plot 1: Top N average suicide rates by country
top_n = 15
top_countries = average_suicide_rates_by_country.head(top_n)
axs[0, 0].barh(top_countries.index, top_countries.values)
axs[0, 0].invert_yaxis()
axs[0, 0].set_title(f'Top {top_n} Average Suicide Rates by Country')
axs[0, 0].set_xlabel('Suicides per 100k Population')
# Plot 2: Average suicide rates by gender
axs[0, 1].bar(average_suicide_rates_by_gender.index, average_suicide_rates_by_gender.values)
axs[0, 1].set_title('Average Suicide Rates by Gender')
axs[0, 1].set_ylabel('Suicides per 100k Population')
# Plot 3: Average suicide rates by age
axs[1, 0].bar(average_suicide_rates_by_age.index, average_suicide_rates_by_age.values)
axs[1, 0].set_title('Average Suicide Rates by Age')
axs[1, 0].set_ylabel('Suicides per 100k Population')
axs[1, 0].set_xticklabels(average_suicide_rates_by_age.index, rotation=45, ha='right')
# Plot 4: Average suicide rates by generation
# For average_suicide_rates_by_generation_and_year
avg_suicide_rates_by_gen_and_year = average_suicide_rates_by_generation_and_year.reset_index()
avg_suicide_rates_by_gen_and_year['year'] = pd.to_numeric(avg_suicide_rates_by_gen_and_year['year'])
axs[1, 1].bar(average_suicide_rates_by_generation.index, average_suicide_rates_by_generation.values)
axs[1, 1].set_title('Average Suicide Rates by Generation')
axs[1, 1].set_ylabel('Suicides per 100k Population')
axs[1, 1].set_xticklabels(average_suicide_rates_by_generation.index, rotation=45, ha='right')
# Plot 5: Suicide rates over time
axs[2, 0].plot(average_suicide_rates_by_year.index, average_suicide_rates_by_year.values)
axs[2, 0].set_xlabel('Year')
axs[2, 0].set_ylabel('Average Suicide Rate per 100k Population')
axs[2, 0].set_title('Suicide Rates Over Time')
# Plot 6: Average suicide rates by country and year
top_n_year = 5
# Find the top 15 countries based on average suicide rates
top_n_countries = average_suicide_rates_by_country_and_year.groupby('country').mean().nlargest(top_n_year).index
# Filter average_suicide_rates_by_country_and_year to only include the top 15 countries
top_n_average_suicide_rates_by_country_and_year = average_suicide_rates_by_country_and_year[average_suicide_rates_by_country_and_year.index.get_level_values('country').isin(top_n_countries)]
# For average_suicide_rates_by_country_and_year (Top 15 countries)
top_n_average_suicide_rates_by_country_and_year = top_n_average_suicide_rates_by_country_and_year.reset_index()
top_n_average_suicide_rates_by_country_and_year['year'] = pd.to_numeric(top_n_average_suicide_rates_by_country_and_year['year'])
for country, data in top_n_average_suicide_rates_by_country_and_year.groupby('country'):
axs[2, 1].plot(data['year'], data['suicides/100k pop'], label=country)
axs[2, 1].set_xlabel('Year')
axs[2, 1].set_ylabel('Suicides per 100k Population')
axs[2, 1].set_title('Average Suicide Rates by Country and Year (Top 5 Countries)')
axs[2, 1].legend(title='Country', bbox_to_anchor=(1, 1))
# Plot 7: Average suicide rates by gender and year
average_suicide_rates_by_gender_and_year.unstack().plot(ax=axs[3, 0])
axs[3, 0].set_ylabel('Suicides per 100k Population')
axs[3, 0].set_title('Average Suicide Rates by Gender and Year')
axs[3, 0].legend(title='Gender', loc='upper right')
# Plot 8: Average suicide rates by age and year
average_suicide_rates_by_age_and_year.unstack().plot(ax=axs[3, 1])
axs[3, 1].set_ylabel('Suicides per 100k Population')
axs[3, 1].set_title('Average Suicide Rates by Age and Year')
axs[3, 1].legend(title='Age Group', loc='upper right')
#Plot 9: Top N average anxiety disorders prevalence by country
top_n = 15
top_n_prevalence = average_prevalence_by_country.sort_values(ascending=False).head(top_n)
axs[4, 0].barh(top_n_prevalence.index, top_n_prevalence.values)
axs[4, 0].invert_yaxis()
axs[4, 0].set_xlabel('Anxiety Disorders Prevalence (%)')
axs[4, 0].set_title('Top 15 Average Anxiety Disorders Prevalence by Country')
#Plot 10: Anxiety disorders prevalence over time
axs[4, 1].plot(mean_prevalence.index, mean_prevalence.values, label='Mean Prevalence')
axs[4, 1].plot(median_prevalence.index, median_prevalence.values, label='Median Prevalence')
axs[4, 1].set_xlabel('Year')
axs[4, 1].set_ylabel('Prevalence of Anxiety Disorders (%)')
axs[4, 1].set_title('Anxiety Disorders Prevalence Over Time')
axs[4, 1].legend()
#Plot 11: Average GDP over time
axs[5, 0].bar(average_gdp.index, average_gdp.values, label='Average GDP')
axs[5, 0].set_xlabel('Year')
axs[5, 0].set_ylabel('GDP (USD)')
axs[5, 0].set_title('Average GDP Over Time')
axs[5, 0].legend()
#Plot 12: Unemployment rate over time
axs[5, 1].plot(mean_unemployment.index, mean_unemployment.values, label='Mean Unemployment Rate')
axs[5, 1].plot(median_unemployment.index, median_unemployment.values, label='Median Unemployment Rate')
axs[5, 1].set_xlabel('Year')
axs[5, 1].set_ylabel('Unemployment Rate (%)')
axs[5, 1].set_title('Unemployment Rate Over Time')
axs[5, 1].legend()
#Adjust layout and display the plots
fig.tight_layout()
plt.show()
df.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
country 0
year 0
sex 0
age 0
suicides_no 0
population 0
suicides/100k pop 0
country-year 0
HDI for year 19456
gdp_for_year ($) 0
gdp_per_capita ($) 0
generation 0
dtype: int64
df2.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity 0
Code 690
Year 0
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) 0
dtype: int64
df3.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
df4.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
"LOCATION" 0
INDICATOR 0
SUBJECT 0
MEASURE 0
FREQUENCY 0
TIME 0
Value 0
Flag Codes 586
dtype: int64
df
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
0 Albania 1987 male 15-24 years 21 312900
1 Albania 1987 male 35-54 years 16 308000
2 Albania 1987 female 15-24 years 14 289700
3 Albania 1987 male 75+ years 1 21800
4 Albania 1987 male 25-34 years 9 274300
... ... ... ... ... ... ...
27815 Uzbekistan 2014 female 35-54 years 107 3620833
27816 Uzbekistan 2014 female 75+ years 9 348465
27817 Uzbekistan 2014 male 5-14 years 60 2762158
27818 Uzbekistan 2014 female 5-14 years 44 2631600
27819 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
0 6.71 Albania1987 NaN 2,156,624,900
1 5.19 Albania1987 NaN 2,156,624,900
2 4.83 Albania1987 NaN 2,156,624,900
3 4.59 Albania1987 NaN 2,156,624,900
4 3.28 Albania1987 NaN 2,156,624,900
... ... ... ... ...
27815 2.96 Uzbekistan2014 0.675 63,067,077,179
27816 2.58 Uzbekistan2014 0.675 63,067,077,179
27817 2.17 Uzbekistan2014 0.675 63,067,077,179
27818 1.67 Uzbekistan2014 0.675 63,067,077,179
27819 1.46 Uzbekistan2014 0.675 63,067,077,179
gdp_per_capita ($) generation
0 796 Generation X
1 796 Silent
2 796 Generation X
3 796 G.I. Generation
4 796 Boomers
... ... ...
27815 2309 Generation X
27816 2309 Silent
27817 2309 Generation Z
27818 2309 Generation Z
27819 2309 Boomers
[27820 rows x 12 columns]
df2
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity Code Year \
0 Afghanistan AFG 1990
1 Afghanistan AFG 1991
2 Afghanistan AFG 1992
3 Afghanistan AFG 1993
4 Afghanistan AFG 1994
... ... ... ...
6835 Zimbabwe ZWE 2015
6836 Zimbabwe ZWE 2016
6837 Zimbabwe ZWE 2017
6838 Zimbabwe ZWE 2018
6839 Zimbabwe ZWE 2019
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)
0 4.84
1 4.82
2 4.80
3 4.79
4 4.78
... ...
6835 3.32
6836 3.32
6837 3.33
6838 3.32
6839 3.28
[6840 rows x 4 columns]
df3
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 1960 1961 \
0 Africa Eastern and Southern AFE 1.931311e+10 1.972349e+10
1 Africa Western and Central AFW 1.040428e+10 1.112805e+10
2 Australia AUS 1.860679e+10 1.968306e+10
3 Austria AUT 6.592694e+09 7.311750e+09
4 Burundi BDI 1.960000e+08 2.030000e+08
.. ... ... ... ...
115 St. Vincent and the Grenadines VCT 1.306656e+07 1.399988e+07
116 World WLD 1.390000e+12 1.440000e+12
117 South Africa ZAF 7.575397e+09 7.972997e+09
118 Zambia ZMB 7.130000e+08 6.962857e+08
119 Zimbabwe ZWE 1.052990e+09 1.096647e+09
1962 1963 1964 1965 1966 \
0 2.149392e+10 2.573321e+10 2.352744e+10 2.681057e+10 2.915216e+10
1 1.194335e+10 1.267652e+10 1.383858e+10 1.486247e+10 1.583285e+10
2 1.992272e+10 2.153993e+10 2.380110e+10 2.597715e+10 2.730989e+10
3 7.756110e+09 8.374175e+09 9.169984e+09 9.994071e+09 1.088768e+10
4 2.135000e+08 2.327500e+08 2.607500e+08 1.589950e+08 1.654446e+08
.. ... ... ... ... ...
115 1.452488e+07 1.370822e+07 1.475821e+07 1.510821e+07 1.609987e+07
116 1.550000e+12 1.670000e+12 1.820000e+12 1.990000e+12 2.160000e+12
117 8.497997e+09 9.423396e+09 1.037400e+10 1.133440e+10 1.235500e+10
118 6.931429e+08 7.187143e+08 8.394286e+08 1.082857e+09 1.264286e+09
119 1.117602e+09 1.159512e+09 1.217138e+09 1.311436e+09 1.281750e+09
1967 ... 2011 2012 2013 \
0 3.017317e+10 ... 9.430000e+11 9.510000e+11 9.640000e+11
1 1.442643e+10 ... 6.710000e+11 7.280000e+11 8.210000e+11
2 3.044462e+10 ... 1.400000e+12 1.550000e+12 1.580000e+12
3 1.157943e+10 ... 4.310000e+11 4.090000e+11 4.300000e+11
4 1.782971e+08 ... 2.235821e+09 2.333308e+09 2.451625e+09
.. ... ... ... ... ...
115 1.583518e+07 ... 6.761296e+08 6.929333e+08 7.212074e+08
116 2.290000e+12 ... 7.370000e+13 7.530000e+13 7.740000e+13
117 1.377739e+10 ... 4.580000e+11 4.340000e+11 4.010000e+11
118 1.368000e+09 ... 2.345952e+10 2.550306e+10 2.803724e+10
119 1.397002e+09 ... 1.410192e+10 1.711485e+10 1.909102e+10
2014 2015 2016 2017 2018 \
0 9.850000e+11 9.200000e+11 8.730000e+11 9.850000e+11 1.010000e+12
1 8.650000e+11 7.610000e+11 6.910000e+11 6.840000e+11 7.420000e+11
2 1.470000e+12 1.350000e+12 1.210000e+12 1.330000e+12 1.430000e+12
3 4.420000e+11 3.820000e+11 3.960000e+11 4.160000e+11 4.550000e+11
4 2.705783e+09 3.104395e+09 2.732809e+09 2.748180e+09 2.668496e+09
.. ... ... ... ... ...
115 7.277148e+08 7.554000e+08 7.744296e+08 7.921778e+08 8.113000e+08
116 7.960000e+13 7.510000e+13 7.630000e+13 8.120000e+13 8.630000e+13
117 3.810000e+11 3.470000e+11 3.240000e+11 3.810000e+11 4.050000e+11
118 2.714102e+10 2.125122e+10 2.095841e+10 2.587360e+10 2.631159e+10
119 1.949552e+10 1.996312e+10 2.054868e+10 1.758489e+10 1.811554e+10
2019 2020
0 1.010000e+12 9.210000e+11
1 7.950000e+11 7.850000e+11
2 1.390000e+12 1.330000e+12
3 4.450000e+11 4.330000e+11
4 2.631434e+09 2.841786e+09
.. ... ...
115 8.250407e+08 8.074741e+08
116 8.760000e+13 8.470000e+13
117 3.880000e+11 3.350000e+11
118 2.330867e+10 1.811063e+10
119 1.928429e+10 1.805117e+10
[120 rows x 63 columns]
df4
-------------------------------------------------------------------------------------------------------------------------------------------------------
"LOCATION" INDICATOR SUBJECT MEASURE FREQUENCY TIME Value \
0 AUS HUR TOT PC_LF A 2000 6.285546
1 AUS HUR TOT PC_LF A 2001 6.742173
2 AUS HUR TOT PC_LF A 2002 6.368911
3 AUS HUR TOT PC_LF A 2003 5.928420
4 AUS HUR TOT PC_LF A 2004 5.396734
.. ... ... ... ... ... ... ...
626 CRI HUR TOT PC_LF A 2011 10.298480
627 CRI HUR TOT PC_LF A 2012 10.171750
628 CRI HUR TOT PC_LF A 2013 9.386163
629 CRI HUR TOT PC_LF A 2014 9.617385
630 CRI HUR TOT PC_LF A 2015 9.612973
Flag Codes
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
.. ...
626 NaN
627 NaN
628 NaN
629 NaN
630 NaN
[631 rows x 8 columns]
filtered_df.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
country 0
year 0
sex 0
age 0
suicides_no 0
population 0
suicides/100k pop 0
country-year 0
HDI for year 0
gdp_for_year ($) 0
gdp_per_capita ($) 0
generation 0
dtype: int64
df2_filtered.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity 0
Year 0
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) 0
dtype: int64
df3_filtered.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
df4_filtered.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
LOCATION 0
TIME 0
Value 0
dtype: int64
filtered_df
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
132 Albania 2000 male 25-34 years 17 232000
133 Albania 2000 male 55-74 years 10 177400
134 Albania 2000 female 75+ years 2 37800
135 Albania 2000 male 75+ years 1 24900
136 Albania 2000 female 15-24 years 6 263900
... ... ... ... ... ... ...
27815 Uzbekistan 2014 female 35-54 years 107 3620833
27816 Uzbekistan 2014 female 75+ years 9 348465
27817 Uzbekistan 2014 male 5-14 years 60 2762158
27818 Uzbekistan 2014 female 5-14 years 44 2631600
27819 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
132 7.33 Albania2000 0.656 3.632044e+09
133 5.64 Albania2000 0.656 3.632044e+09
134 5.29 Albania2000 0.656 3.632044e+09
135 4.02 Albania2000 0.656 3.632044e+09
136 2.27 Albania2000 0.656 3.632044e+09
... ... ... ... ...
27815 2.96 Uzbekistan2014 0.675 6.306708e+10
27816 2.58 Uzbekistan2014 0.675 6.306708e+10
27817 2.17 Uzbekistan2014 0.675 6.306708e+10
27818 1.67 Uzbekistan2014 0.675 6.306708e+10
27819 1.46 Uzbekistan2014 0.675 6.306708e+10
gdp_per_capita ($) generation
132 1299 Generation X
133 1299 Silent
134 1299 G.I. Generation
135 1299 G.I. Generation
136 1299 Generation X
... ... ...
27815 2309 Generation X
27816 2309 Silent
27817 2309 Generation Z
27818 2309 Generation Z
27819 2309 Boomers
[16008 rows x 12 columns]
df2_filtered
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity Year \
10 Afghanistan 2000
11 Afghanistan 2001
12 Afghanistan 2002
13 Afghanistan 2003
14 Afghanistan 2004
... ... ...
6831 Zimbabwe 2011
6832 Zimbabwe 2012
6833 Zimbabwe 2013
6834 Zimbabwe 2014
6835 Zimbabwe 2015
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)
10 4.79
11 4.79
12 4.79
13 4.79
14 4.79
... ...
6831 3.29
6832 3.30
6833 3.30
6834 3.31
6835 3.32
[3648 rows x 3 columns]
df3_filtered
-------------------------------------------------------------------------------------------------------------------------------------------------------
Country Name Country Code 2000 2001 \
0 Africa Eastern and Southern AFE 2.840000e+11 2.590000e+11
1 Africa Western and Central AFW 1.400000e+11 1.480000e+11
2 Australia AUS 4.160000e+11 3.790000e+11
3 Austria AUT 1.970000e+11 1.970000e+11
4 Burundi BDI 8.704861e+08 8.767947e+08
.. ... ... ... ...
115 St. Vincent and the Grenadines VCT 3.962630e+08 4.300407e+08
116 World WLD 3.380000e+13 3.360000e+13
117 South Africa ZAF 1.520000e+11 1.350000e+11
118 Zambia ZMB 3.600683e+09 4.094481e+09
119 Zimbabwe ZWE 6.689958e+09 6.777385e+09
2002 2003 2004 2005 2006 \
0 2.650000e+11 3.530000e+11 4.390000e+11 5.120000e+11 5.760000e+11
1 1.770000e+11 2.050000e+11 2.540000e+11 3.110000e+11 3.930000e+11
2 3.950000e+11 4.670000e+11 6.140000e+11 6.950000e+11 7.480000e+11
3 2.130000e+11 2.620000e+11 3.010000e+11 3.160000e+11 3.360000e+11
4 8.253945e+08 7.846544e+08 9.152573e+08 1.117113e+09 1.273375e+09
.. ... ... ... ... ...
115 4.618852e+08 4.818074e+08 5.219741e+08 5.507296e+08 6.109296e+08
116 3.490000e+13 3.910000e+13 4.410000e+13 4.780000e+13 5.180000e+13
117 1.290000e+11 1.970000e+11 2.560000e+11 2.890000e+11 3.040000e+11
118 4.193846e+09 4.901840e+09 6.221078e+09 8.331870e+09 1.275686e+10
119 6.342116e+09 5.727592e+09 5.805598e+09 5.755215e+09 5.443896e+09
2007 2008 2009 2010 2011 \
0 6.610000e+11 7.080000e+11 7.130000e+11 8.470000e+11 9.430000e+11
1 4.620000e+11 5.660000e+11 5.070000e+11 5.920000e+11 6.710000e+11
2 8.540000e+11 1.060000e+12 9.280000e+11 1.150000e+12 1.400000e+12
3 3.890000e+11 4.300000e+11 4.000000e+11 3.920000e+11 4.310000e+11
4 1.356199e+09 1.611836e+09 1.781455e+09 2.032135e+09 2.235821e+09
.. ... ... ... ... ...
115 6.844444e+08 6.954296e+08 6.749222e+08 6.812259e+08 6.761296e+08
116 5.830000e+13 6.400000e+13 6.070000e+13 6.650000e+13 7.370000e+13
117 3.330000e+11 3.160000e+11 3.300000e+11 4.170000e+11 4.580000e+11
118 1.405696e+10 1.791086e+10 1.532834e+10 2.026556e+10 2.345952e+10
119 5.291950e+09 4.415703e+09 9.665793e+09 1.204166e+10 1.410192e+10
2012 2013 2014 2015
0 9.510000e+11 9.640000e+11 9.850000e+11 9.200000e+11
1 7.280000e+11 8.210000e+11 8.650000e+11 7.610000e+11
2 1.550000e+12 1.580000e+12 1.470000e+12 1.350000e+12
3 4.090000e+11 4.300000e+11 4.420000e+11 3.820000e+11
4 2.333308e+09 2.451625e+09 2.705783e+09 3.104395e+09
.. ... ... ... ...
115 6.929333e+08 7.212074e+08 7.277148e+08 7.554000e+08
116 7.530000e+13 7.740000e+13 7.960000e+13 7.510000e+13
117 4.340000e+11 4.010000e+11 3.810000e+11 3.470000e+11
118 2.550306e+10 2.803724e+10 2.714102e+10 2.125122e+10
119 1.711485e+10 1.909102e+10 1.949552e+10 1.996312e+10
[120 rows x 18 columns]
df4_filtered
-------------------------------------------------------------------------------------------------------------------------------------------------------
LOCATION TIME Value
0 AUS 2000 6.285546
1 AUS 2001 6.742173
2 AUS 2002 6.368911
3 AUS 2003 5.928420
4 AUS 2004 5.396734
.. ... ... ...
626 CRI 2011 10.298480
627 CRI 2012 10.171750
628 CRI 2013 9.386163
629 CRI 2014 9.617385
630 CRI 2015 9.612973
[631 rows x 3 columns]
average_suicide_rates_by_country = filtered_df.groupby('country')['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
country
Lithuania 38.015208
Republic of Korea 35.543646
Russian Federation 31.338229
Guyana 30.191667
Sri Lanka 30.104000
...
Antigua and Barbuda 0.874405
Barbados 0.834881
Oman 0.736111
Jamaica 0.688583
Kiribati 0.000000
Name: suicides/100k pop, Length: 97, dtype: float64
average_suicide_rates_by_country_and_year = filtered_df.groupby(['year','country'])['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
year country
2000 Albania 2.558333
Antigua and Barbuda 3.330833
Argentina 10.949167
Armenia 2.858333
Aruba 25.444167
...
2015 Turkmenistan 2.373333
Ukraine 20.393333
United Kingdom 7.228333
United States 14.617500
Uruguay 22.501667
Name: suicides/100k pop, Length: 1334, dtype: float64
average_suicide_rates_by_gender = filtered_df.groupby('sex')['suicides/100k pop'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
sex
female 4.901246
male 19.299390
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_gender_and_year = filtered_df.groupby(['year','sex'])['suicides/100k pop'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
year sex
2000 female 5.783004
male 22.099651
2001 female 5.491932
male 21.546345
2002 female 5.606105
male 21.966996
2003 female 5.302384
male 21.107655
2004 female 5.054127
male 19.909762
2005 female 5.007599
male 19.129286
2006 female 4.769059
male 19.085863
2007 female 4.994012
male 19.056667
2008 female 4.979412
male 18.835961
2009 female 4.528240
male 18.273333
2010 female 4.406117
male 18.025682
2011 female 4.344322
male 17.686376
2012 female 4.521626
male 18.321811
2013 female 4.366208
male 17.849708
2014 female 4.458803
male 17.564124
2015 female 4.653468
male 17.534677
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_age = filtered_df.groupby('age')['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
age
75+ years 22.080315
55-74 years 15.342031
35-54 years 14.420795
25-34 years 11.625240
15-24 years 8.504663
5-14 years 0.628864
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_age_and_year = filtered_df.groupby(['year','age'])['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
year age
2000 15-24 years 9.442733
25-34 years 13.756919
35-54 years 16.443372
5-14 years 0.532442
55-74 years 17.531279
...
2015 25-34 years 10.111774
35-54 years 12.555645
5-14 years 0.704677
55-74 years 14.451774
75+ years 20.967339
Name: suicides/100k pop, Length: 96, dtype: float64
correlations = filtered_df[['suicides/100k pop', 'gdp_per_capita ($)', ' gdp_for_year ($) ']].corr()
-------------------------------------------------------------------------------------------------------------------------------------------------------
suicides/100k pop gdp_per_capita ($) gdp_for_year ($)
suicides/100k pop 1.000000 -0.010388 0.026922
gdp_per_capita ($) -0.010388 1.000000 0.271639
gdp_for_year ($) 0.026922 0.271639 1.000000
average_suicide_rates_by_generation = filtered_df.groupby('generation')['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
generation
G.I. Generation 25.941221
Silent 19.236765
Boomers 14.789173
Generation X 12.256906
Millenials 6.720002
Generation Z 0.642299
Name: suicides/100k pop, dtype: float64
average_suicide_rates_by_generation_and_year = filtered_df.groupby(['year', 'generation'])['suicides/100k pop'].mean().sort_values(ascending=False)
-------------------------------------------------------------------------------------------------------------------------------------------------------
year generation
2000 Boomers 16.443372
G.I. Generation 25.941221
Generation X 11.599826
Millenials 0.532442
Silent 17.531279
...
2015 Boomers 14.451774
Generation X 12.555645
Generation Z 0.704677
Millenials 8.942500
Silent 20.967339
Name: suicides/100k pop, Length: 73, dtype: float64
average_suicide_rates_by_year = filtered_df.groupby('year')['suicides/100k pop'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
year
2000 13.941328
2001 13.519138
2002 13.786550
2003 13.205019
2004 12.481944
2005 12.068442
2006 11.927461
2007 12.025339
2008 11.907686
2009 11.400787
2010 11.215900
2011 11.015349
2012 11.421718
2013 11.107958
2014 11.011464
2015 11.094073
Name: suicides/100k pop, dtype: float64
prevalence_by_country = df2_filtered.groupby(['Entity', 'Year'])['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity Year
Afghanistan 2000 4.79
2001 4.79
2002 4.79
2003 4.79
2004 4.79
...
Zimbabwe 2011 3.29
2012 3.30
2013 3.30
2014 3.31
2015 3.32
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), Length: 3648, dtype: float64
average_prevalence_by_country = df2_filtered.groupby('Entity')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Entity
Afghanistan 4.841875
African Region (WHO) 3.537500
Albania 3.990625
Algeria 4.918125
American Samoa 4.193750
...
World Bank Lower Middle Income 3.293125
World Bank Upper Middle Income 4.211875
Yemen 4.914375
Zambia 3.881250
Zimbabwe 3.303125
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), Length: 228, dtype: float64
mean_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Year
2000 4.305789
2001 4.309298
2002 4.311798
2003 4.313684
2004 4.315526
2005 4.317544
2006 4.323596
2007 4.334781
2008 4.348816
2009 4.361491
2010 4.369123
2011 4.373772
2012 4.378772
2013 4.383860
2014 4.389649
2015 4.394605
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64
median_prevalence = df2_filtered.groupby('Year')['Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)'].median()
-------------------------------------------------------------------------------------------------------------------------------------------------------
Year
2000 4.055
2001 4.055
2002 4.055
2003 4.055
2004 4.055
2005 4.055
2006 4.070
2007 4.090
2008 4.115
2009 4.130
2010 4.140
2011 4.150
2012 4.160
2013 4.170
2014 4.180
2015 4.180
Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64
total_gdp = df3_filtered.iloc[:, 2:].sum(axis=0)
-------------------------------------------------------------------------------------------------------------------------------------------------------
2000 2.160246e+14
2001 2.151838e+14
2002 2.233870e+14
2003 2.505006e+14
2004 2.831420e+14
2005 3.095809e+14
2006 3.389768e+14
2007 3.857718e+14
2008 4.279962e+14
2009 4.111343e+14
2010 4.573194e+14
2011 5.119564e+14
2012 5.284604e+14
2013 5.477950e+14
2014 5.653896e+14
2015 5.399171e+14
dtype: float64
average_gdp = df3_filtered.iloc[:, 2:].mean(axis=0)
-------------------------------------------------------------------------------------------------------------------------------------------------------
2000 1.800205e+12
2001 1.793198e+12
2002 1.861558e+12
2003 2.087505e+12
2004 2.359517e+12
2005 2.579841e+12
2006 2.824807e+12
2007 3.214765e+12
2008 3.566635e+12
2009 3.426120e+12
2010 3.810995e+12
2011 4.266303e+12
2012 4.403837e+12
2013 4.564958e+12
2014 4.711580e+12
2015 4.499309e+12
dtype: float64
mean_unemployment = df4_filtered.groupby('TIME')['Value'].mean()
-------------------------------------------------------------------------------------------------------------------------------------------------------
TIME
2000 7.901248
2001 7.765694
2002 7.832388
2003 7.879029
2004 7.936171
2005 7.560420
2006 6.812503
2007 6.196204
2008 6.307551
2009 8.764485
2010 9.282159
2011 8.937410
2012 9.230890
2013 9.234676
2014 8.744107
2015 8.165753
Name: Value, dtype: float64
median_unemployment = df4_filtered.groupby('TIME')['Value'].median()
-------------------------------------------------------------------------------------------------------------------------------------------------------
TIME
2000 6.779167
2001 6.666920
2002 6.886809
2003 7.575000
2004 7.375000
2005 7.783333
2006 6.466667
2007 5.966615
2008 6.239756
2009 8.219090
2010 8.375000
2011 8.045834
2012 8.020834
2013 8.200000
2014 7.456700
2015 6.915025
Name: Value, dtype: float64
<ipython-input-31-29986ce04bff>:336: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df3_filtered.rename(columns={'Country Name': 'country', 'Country Code': 'country_code'}, inplace=True)
merged_df = filtered_df.merge(df2_filtered, on=['country', 'year'], how='left')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
0 Albania 2000 male 25-34 years 17 232000
1 Albania 2000 male 55-74 years 10 177400
2 Albania 2000 female 75+ years 2 37800
3 Albania 2000 male 75+ years 1 24900
4 Albania 2000 female 15-24 years 6 263900
... ... ... ... ... ... ...
16003 Uzbekistan 2014 female 35-54 years 107 3620833
16004 Uzbekistan 2014 female 75+ years 9 348465
16005 Uzbekistan 2014 male 5-14 years 60 2762158
16006 Uzbekistan 2014 female 5-14 years 44 2631600
16007 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
0 7.33 Albania2000 0.656 3.632044e+09
1 5.64 Albania2000 0.656 3.632044e+09
2 5.29 Albania2000 0.656 3.632044e+09
3 4.02 Albania2000 0.656 3.632044e+09
4 2.27 Albania2000 0.656 3.632044e+09
... ... ... ... ...
16003 2.96 Uzbekistan2014 0.675 6.306708e+10
16004 2.58 Uzbekistan2014 0.675 6.306708e+10
16005 2.17 Uzbekistan2014 0.675 6.306708e+10
16006 1.67 Uzbekistan2014 0.675 6.306708e+10
16007 1.46 Uzbekistan2014 0.675 6.306708e+10
gdp_per_capita ($) generation \
0 1299 Generation X
1 1299 Silent
2 1299 G.I. Generation
3 1299 G.I. Generation
4 1299 Generation X
... ... ...
16003 2309 Generation X
16004 2309 Silent
16005 2309 Generation Z
16006 2309 Generation Z
16007 2309 Boomers
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent)
0 3.93
1 3.93
2 3.93
3 3.93
4 3.93
... ...
16003 2.12
16004 2.12
16005 2.12
16006 2.12
16007 2.12
[16008 rows x 13 columns]
merged_df = merged_df.merge(df3_long, on=['country', 'year'], how='left')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
0 Albania 2000 male 25-34 years 17 232000
1 Albania 2000 male 55-74 years 10 177400
2 Albania 2000 female 75+ years 2 37800
3 Albania 2000 male 75+ years 1 24900
4 Albania 2000 female 15-24 years 6 263900
... ... ... ... ... ... ...
16003 Uzbekistan 2014 female 35-54 years 107 3620833
16004 Uzbekistan 2014 female 75+ years 9 348465
16005 Uzbekistan 2014 male 5-14 years 60 2762158
16006 Uzbekistan 2014 female 5-14 years 44 2631600
16007 Uzbekistan 2014 female 55-74 years 21 1438935
suicides/100k pop country-year HDI for year gdp_for_year ($) \
0 7.33 Albania2000 0.656 3.632044e+09
1 5.64 Albania2000 0.656 3.632044e+09
2 5.29 Albania2000 0.656 3.632044e+09
3 4.02 Albania2000 0.656 3.632044e+09
4 2.27 Albania2000 0.656 3.632044e+09
... ... ... ... ...
16003 2.96 Uzbekistan2014 0.675 6.306708e+10
16004 2.58 Uzbekistan2014 0.675 6.306708e+10
16005 2.17 Uzbekistan2014 0.675 6.306708e+10
16006 1.67 Uzbekistan2014 0.675 6.306708e+10
16007 1.46 Uzbekistan2014 0.675 6.306708e+10
gdp_per_capita ($) generation \
0 1299 Generation X
1 1299 Silent
2 1299 G.I. Generation
3 1299 G.I. Generation
4 1299 Generation X
... ... ...
16003 2309 Generation X
16004 2309 Silent
16005 2309 Generation Z
16006 2309 Generation Z
16007 2309 Boomers
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) \
0 3.93
1 3.93
2 3.93
3 3.93
4 3.93
... ...
16003 2.12
16004 2.12
16005 2.12
16006 2.12
16007 2.12
country_code gdp
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
... ... ...
16003 NaN NaN
16004 NaN NaN
16005 NaN NaN
16006 NaN NaN
16007 NaN NaN
[16008 rows x 15 columns]
merged_df = merged_df.merge(df4_filtered, on=['country', 'year'], how='left')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
804 Australia 2000 male 25-34 years 466 1430700
805 Australia 2000 male 75+ years 115 416077
806 Australia 2000 male 35-54 years 745 2769752
807 Australia 2000 male 15-24 years 271 1333011
808 Australia 2000 male 55-74 years 281 1522620
... ... ... ... ... ... ...
15679 United States 2015 female 25-34 years 1444 21555712
15680 United States 2015 female 15-24 years 1132 21633813
15681 United States 2015 female 75+ years 540 11778666
15682 United States 2015 male 5-14 years 255 21273987
15683 United States 2015 female 5-14 years 158 20342901
suicides/100k pop gdp_for_year ($) gdp_per_capita ($) \
804 32.57 4.150342e+11 23219
805 27.64 4.150342e+11 23219
806 26.90 4.150342e+11 23219
807 20.33 4.150342e+11 23219
808 18.46 4.150342e+11 23219
... ... ... ...
15679 6.70 1.812071e+13 60387
15680 5.23 1.812071e+13 60387
15681 4.58 1.812071e+13 60387
15682 1.20 1.812071e+13 60387
15683 0.78 1.812071e+13 60387
generation \
804 Generation X
805 G.I. Generation
806 Boomers
807 Generation X
808 Silent
... ...
15679 Millenials
15680 Millenials
15681 Silent
15682 Generation Z
15683 Generation Z
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) \
804 5.61
805 5.61
806 5.61
807 5.61
808 5.61
... ...
15679 5.61
15680 5.61
15681 5.61
15682 5.61
15683 5.61
value
804 6.285546
805 6.285546
806 6.285546
807 6.285546
808 6.285546
... ...
15679 5.291667
15680 5.291667
15681 5.291667
15682 5.291667
15683 5.291667
[4128 rows x 12 columns]
merged_df.isnull().sum()
-------------------------------------------------------------------------------------------------------------------------------------------------------
country 0
year 0
sex 0
age 0
suicides_no 0
population 0
suicides/100k pop 0
gdp_for_year ($) 0
gdp_per_capita ($) 0
generation 0
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) 0
value 0
dtype: int64
merged_df.describe(include='all')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year sex age suicides_no population \
count 4128 4128.000000 4128 4128 4128.000000 4.128000e+03
unique 24 NaN 2 6 NaN NaN
top Italy NaN male 25-34 years NaN NaN
freq 192 NaN 2064 688 NaN NaN
mean NaN 2007.720930 NaN NaN 424.522771 3.507311e+06
std NaN 4.558069 NaN NaN 1162.719207 5.880734e+06
min NaN 2000.000000 NaN NaN 0.000000 6.532000e+03
25% NaN 2004.000000 NaN NaN 17.000000 4.877772e+05
50% NaN 2008.000000 NaN NaN 88.000000 1.277342e+06
75% NaN 2012.000000 NaN NaN 290.000000 3.959502e+06
max NaN 2015.000000 NaN NaN 11767.000000 4.380521e+07
suicides/100k pop gdp_for_year ($) gdp_per_capita ($) generation \
count 4128.000000 4.128000e+03 4128.000000 4128
unique NaN NaN NaN 6
top NaN NaN NaN Millenials
freq NaN NaN NaN 1156
mean 11.867384 1.563796e+12 40700.299419 NaN
std 13.023472 3.035203e+12 23309.800208 NaN
min 0.000000 1.131644e+10 4866.000000 NaN
25% 2.340000 2.382038e+11 24759.000000 NaN
50% 7.670000 4.880970e+11 40347.500000 NaN
75% 17.502500 1.464961e+12 51772.000000 NaN
max 99.840000 1.812071e+13 126352.000000 NaN
Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent) \
count 4128.000000
unique NaN
top NaN
freq NaN
mean 5.677878
std 1.341483
min 2.560000
25% 4.730000
50% 5.830000
75% 6.482500
max 8.780000
value
count 4128.000000
unique NaN
top NaN
freq NaN
mean 7.641546
std 4.035549
min 1.900000
25% 5.031250
50% 6.870833
75% 9.008333
max 27.825000
grouped_data = merged_df.groupby(['country', 'year']).agg
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year suicides_no population gdp_for_year ($) \
0 Australia 2000 2391 17874410 4.150342e+11
1 Australia 2001 2458 18130883 3.782151e+11
2 Australia 2002 2319 18370058 3.944867e+11
3 Australia 2003 2156 18608029 4.662947e+11
4 Australia 2004 2114 18854551 6.119043e+11
.. ... ... ... ... ...
339 United States 2011 39508 290313825 1.551793e+13
340 United States 2012 40596 292827128 1.615526e+13
341 United States 2013 41143 295322862 1.669152e+13
342 United States 2014 42769 297749735 1.742761e+13
343 United States 2015 44189 300078511 1.812071e+13
gdp_per_capita ($) Anxiety Prevalence (%) Unemployment Rate(%) \
0 23219 5.61 6.285546
1 20860 5.66 6.742173
2 21474 5.78 6.368911
3 25059 5.92 5.928420
4 32454 6.04 5.396734
.. ... ... ...
339 53452 6.22 8.950000
340 55170 6.03 8.066667
341 56520 5.84 7.375000
342 58531 5.68 6.166667
343 60387 5.61 5.291667
suicides/100k pop
0 13.376665
1 13.556979
2 12.623803
3 11.586396
4 11.212147
.. ...
339 13.608722
340 13.863470
341 13.931532
342 14.364077
343 14.725813
[344 rows x 9 columns]
grouped_data.describe(include='all')
-------------------------------------------------------------------------------------------------------------------------------------------------------
country year suicides_no population gdp_for_year ($) \
count 344 344.000000 344.000000 3.440000e+02 3.440000e+02
unique 24 NaN NaN NaN NaN
top Italy NaN NaN NaN NaN
freq 16 NaN NaN NaN NaN
mean NaN 2007.720930 5094.273256 4.208773e+07 1.563796e+12
std NaN 4.564155 9151.842382 6.224772e+07 3.039256e+12
min NaN 2000.000000 26.000000 2.683300e+05 1.131644e+10
25% NaN 2004.000000 905.750000 7.858817e+06 2.382038e+11
50% NaN 2008.000000 1844.500000 1.542053e+07 4.880970e+11
75% NaN 2012.000000 3980.750000 5.648715e+07 1.464961e+12
max NaN 2015.000000 44189.000000 3.000785e+08 1.812071e+13
gdp_per_capita ($) Anxiety Prevalence (%) Unemployment Rate(%) \
count 344.000000 344.000000 344.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 40700.299419 5.677878 7.641546
std 23340.927117 1.343274 4.040938
min 4866.000000 2.560000 1.900000
25% 24759.000000 4.730000 5.031250
50% 40347.500000 5.830000 6.870833
75% 51772.000000 6.482500 9.008333
max 126352.000000 8.780000 27.825000
suicides/100k pop
count 344.000000
unique NaN
top NaN
freq NaN
mean 12.036260
std 5.413287
min 1.591815
25% 7.764163
50% 11.738373
75% 14.491590
max 26.480336
14068750000000.0 14450000000000.0 2398363700004.65 3650000000000.0
count 16.00 mean 6.48 std 0.52 min 5.61 25% 6.17 50% 6.49 75% 6.82 max 7.31 Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64 6.485
count 16008.00 mean 237.05 std 884.03 min 0.00 25% 2.00 50% 23.00 75% 120.00 max 21262.00 Name: suicides_no, dtype: float64 23.0
count 180.00 mean 6.01 std 0.18 min 5.61 25% 5.92 50% 6.08 75% 6.14 max 6.16 Name: Prevalence - Anxiety disorders - Sex: Both - Age: Age-standardized (Percent), dtype: float64 6.045
<ipython-input-31-29986ce04bff>:611: UserWarning: FixedFormatter should only be used together with FixedLocator axs[1, 0].set_xticklabels(average_suicide_rates_by_age.index, rotation=45, ha='right') <ipython-input-31-29986ce04bff>:620: UserWarning: FixedFormatter should only be used together with FixedLocator axs[1, 1].set_xticklabels(average_suicide_rates_by_generation.index, rotation=45, ha='right')
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA
# Assuming your data is in a pandas DataFrame called "data"
# Calculate correlation matrix
# Display correlation matrix
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
# ------------------------------
df = pd.read_csv("transformed_data.csv")
# ------------------------------
def predict_future(df, independent_var):
# Group the data by year and calculate the mean suicide rate for the given independent variable
df_yearly = df.groupby('year')[independent_var].mean().reset_index()
# Set the year as the index
df_yearly.set_index('year', inplace=True)
# Determine the order of the ARIMA model (p, d, q)
# You may need to experiment with different values or use an automated approach
p, d, q = 1, 1, 1
# Fit the ARIMA model
model = ARIMA(df_yearly, order=(p, d, q))
model_fit = model.fit()
# Make predictions for the next 5 years
forecast_years = 5
forecast = model_fit.forecast(steps=forecast_years)
# Plot the historical data and the predictions
plt.figure(figsize=(12, 6))
plt.plot(df_yearly.index, df_yearly[independent_var], label='Historical Data')
plt.plot(np.arange(df_yearly.index[-1]+1, df_yearly.index[-1]+1+forecast_years), forecast, label='Predictions', linestyle='--', color='red')
plt.xlabel('Year')
plt.ylabel(independent_var)
plt.title(f'{independent_var} : Historical Data and Predictions')
plt.legend()
plt.show()
return forecast
def psk(x,y):
Pearson = scipy.stats.pearsonr(x,y)[0]
Spearmanr = scipy.stats.spearmanr(x,y)[0]
Kendall = scipy.stats.kendalltau(x,y)[0]
print("Pearson: " + str(Pearson))
print("Spearmanr: " + str(Spearmanr))
print("Kendall: " + str(Kendall))
def visual(x,y):
slope, intercept, r, p, stderr = scipy.stats.linregress(x,y)
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x,r={r:.2f}'
fig, ax = plt.subplots()
ax.plot(x,y, linewidth=0, marker='s', label='Data points')
ax.plot(x, intercept + slope * x, label=line)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.legend(facecolor='white',loc='upper left')
plt.show
def data_analytics(val, country):
filtered_data = grouped_data[grouped_data['country'] == country]
# Preparing data for linear regression
X = filtered_data[val]
y = filtered_data['suicides/100k pop']
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fitting the linear regression model
model = LinearRegression()
X_train_reshaped = X_train.values.reshape(-1, 1)
y_train_reshaped = y_train.values.reshape(-1, 1)
X_test_reshaped = X_test.values.reshape(-1, 1)
model.fit(X_train_reshaped, y_train_reshaped)
# Making predictions
y_pred = model.predict(X_test_reshaped)
# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
df_preds = pd.DataFrame({'Actual': y_test.squeeze(), 'Predicted': y_pred.squeeze()})
print(f'Current country being analyzed is {country} and the current independent variable is {val}')
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(df_preds.reset_index())
print()
print()
print()
print()
print("Mean Squared Error")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(mse)
print()
print()
print()
print()
print("R-squared")
print("-------------------------------------------------------------------------------------------------------------------------------------------------------")
print(r2)
print()
print()
print()
print()
plt.scatter(X_train, y_train,color='b')
plt.plot(X_test, y_pred,color='k')
plt.show()
plt.figure(figsize=(12, 6))
plt.scatter(df_preds.index, df_preds['Actual'], label='Actual', alpha=0.7)
plt.scatter(df_preds.index, df_preds['Predicted'], label='Predicted', alpha=0.7)
plt.xlabel("Index")
plt.ylabel("Suicides/100k pop")
plt.title("Actual vs Predicted Suicide Rates")
plt.legend()
plt.show()
residuals = df_preds['Actual'] - df_preds['Predicted']
plt.figure(figsize=(12, 6))
plt.scatter(df_preds.index, residuals, alpha=0.7)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Index")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.show()
#sns.pairplot(filtered_data[['suicides/100k pop', 'year', 'gdp_per_capita ($)', 'Anxiety Prevalence (%)']])
plt.show()
df_yearly = df.groupby('year')['suicides/100k pop'].mean().reset_index()
df_yearly.to_csv('df_yearly.csv', index=False)
# Set the year as the index
df_yearly.set_index('year', inplace=True)
# Determine the order of the ARIMA model (p, d, q)
# You may need to experiment with different values or use an automated approach
p, d, q = 1, 1, 1
# Fit the ARIMA model
model = ARIMA(df_yearly, order=(p, d, q))
model_fit = model.fit()
# Make predictions for the next 5 years
forecast_years = 5
forecast = model_fit.forecast(steps=forecast_years)
# Plot the historical data and the predictions
plt.figure(figsize=(12, 6))
plt.plot(df_yearly.index, df_yearly['suicides/100k pop'], label='Historical Data')
plt.plot(np.arange(df_yearly.index[-1]+1, df_yearly.index[-1]+1+forecast_years), forecast, label='Predictions', linestyle='--', color='red')
plt.xlabel('Year')
plt.ylabel('Suicides/100k pop')
plt.title('Suicide Rates: Historical Data and Predictions')
plt.legend()
plt.show()
def diff_country(country):
filtered_data = grouped_data[grouped_data['country'] == country]
correlation_matrix = filtered_data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix")
plt.show()
data_analytics('year', country)
data_analytics('population', country)
data_analytics(' gdp_for_year ($) ', country)
data_analytics('gdp_per_capita ($)', country)
data_analytics('Anxiety Prevalence (%)', country)
data_analytics('Unemployment Rate(%)', country)
diff_country('United States')
<ipython-input-42-f5e85508bd51>:180: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = filtered_data.corr()
Current country being analyzed is United States and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 328 11.189108 10.869199 1 329 11.531207 11.111481 2 333 11.817534 12.080610 3 342 14.364077 14.261149 Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.08957859509325357 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9430894492071287
0 13.376665
1 13.556979
2 12.623803
3 11.586396
4 11.212147
...
75 10.903971
76 11.095198
84 4.829425
85 5.241577
95 20.033117
Name: suicides/100k pop, Length: 80, dtype: float64
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-42-f5e85508bd51> in <cell line: 193>() 191 192 --> 193 diff_country('United States') 194 195 <ipython-input-42-f5e85508bd51> in diff_country(country) 183 plt.title("Correlation Matrix") 184 plt.show() --> 185 data_analytics('year', country) 186 data_analytics('population', country) 187 data_analytics(' gdp_for_year ($) ', country) <ipython-input-42-f5e85508bd51> in data_analytics(val, country) 153 print(df_yearly.head()) 154 # Set the year as the index --> 155 df_yearly.set_index('year', inplace=True) 156 157 # Determine the order of the ARIMA model (p, d, q) /usr/local/lib/python3.10/dist-packages/pandas/core/groupby/groupby.py in __getattr__(self, attr) 985 return self[attr] 986 --> 987 raise AttributeError( 988 f"'{type(self).__name__}' object has no attribute '{attr}'" 989 ) AttributeError: 'SeriesGroupBy' object has no attribute 'set_index'
In [ ]:
diff_country('Austria')
<ipython-input-132-e3896bfb6725>:124: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = filtered_data.corr()
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 5013.347647238664 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.5183757093701598 Current country being analyzed is Austria and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 1588 1484.749097 1 16 1489 1465.859206 2 20 1399 1390.299639 3 29 1314 1220.290614
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 4473.375039995245 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.5702500141705459 Current country being analyzed is Austria and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 1588 1500.083688 1 16 1489 1478.097520 2 20 1399 1378.275132 3 29 1314 1215.939620
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 2939.988661563725 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.7175599912036194 Current country being analyzed is Austria and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 1588 1535.030735 1 16 1489 1534.364861 2 20 1399 1387.557576 3 29 1314 1231.748483
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 2651.2874003561037 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.7452950596482837 Current country being analyzed is Austria and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 1588 1533.877062 1 16 1489 1534.642326 2 20 1399 1384.495454 3 29 1314 1240.636157
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 5629.051857749779 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.45922599056130087 Current country being analyzed is Austria and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 1588 1467.543761 1 16 1489 1467.543761 2 20 1399 1401.990834 3 29 1314 1227.183028
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 12741.638088586747 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.22406879348528919 Current country being analyzed is Austria and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 15 1588 1419.459192 1 16 1489 1415.995454 2 20 1399 1273.982170 3 29 1314 1273.982170
In [ ]:
diff_country('Japan')
<ipython-input-132-e3896bfb6725>:124: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = filtered_data.corr()
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 5707855.394925548 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.018775807371588793 Current country being analyzed is Japan and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 29989 32954.983755 1 182 29132 32462.965704 2 186 30369 30494.893502 3 195 24357 26066.731047
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 5946874.315872792 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.022313381387096154 Current country being analyzed is Japan and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 29989 28282.626009 1 182 29132 28492.827007 2 186 30369 28889.633548 3 195 24357 28632.361722
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 6038046.361940648 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.037986556529680415 Current country being analyzed is Japan and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 29989 28938.871649 1 182 29132 29295.339644 2 186 30369 29019.513130 3 195 24357 28961.521826
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 6031312.843286734 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.03682901294323826 Current country being analyzed is Japan and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 29989 28918.854599 1 182 29132 29295.145945 2 186 30369 29024.927874 3 195 24357 28955.575183
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 2062466.1252913745 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.645446228295895 Current country being analyzed is Japan and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 29989 31062.181585 1 182 29132 31062.181585 2 186 30369 31237.599989 3 195 24357 25975.047867
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 2396783.8616983127 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.5879744410325223 Current country being analyzed is Japan and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 181 29989 29430.351490 1 182 29132 30204.497356 2 186 30369 28717.320088 3 195 24357 26680.090652
In [ ]:
diff_country('United Kingdom')
<ipython-input-132-e3896bfb6725>:124: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = filtered_data.corr()
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 90182.91019529989 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.08938020180968498 Current country being analyzed is United Kingdom and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 4290 3781.956679 1 313 4128 3843.741877 2 317 4047 4090.882671 3 326 4788 4646.949458
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 41970.64624859675 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.49300825420954164 Current country being analyzed is United Kingdom and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 4290 3974.475267 1 313 4128 3881.058459 2 317 4047 4086.711060 3 326 4788 4712.041887
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 63865.17679931215 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.22852945153823756 Current country being analyzed is United Kingdom and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 4290 4045.148192 1 313 4128 4037.766636 2 317 4047 4288.792185 3 326 4788 4428.969669
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 67762.14308567054 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.18145536721023037 Current country being analyzed is United Kingdom and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 4290 4160.493019 1 313 4128 4158.807412 2 317 4047 4301.876813 3 326 4788 4353.989369
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 35857.30641989976 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.5668554095286011 Current country being analyzed is United Kingdom and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 4290 4118.337109 1 313 4128 4100.780103 2 317 4047 4030.552081 3 326 4788 4451.920217
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 69588.78431017889 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.15939013576583083 Current country being analyzed is United Kingdom and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 312 4290 4226.085825 1 313 4128 4195.909163 2 317 4047 4173.779610 3 326 4788 4284.427372
In [ ]:
diff_country('Finland')
<ipython-input-132-e3896bfb6725>:124: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = filtered_data.corr()
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 2261.841705059185 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9159338534850054 Current country being analyzed is Finland and the current independent variable is year ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 1165 1179.453971 1 91 1204 1155.069495 2 95 994 1057.531588 3 104 789 838.071300
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 2579.752702998046 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.9041180166509433 Current country being analyzed is Finland and the current independent variable is population ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 1165 1159.687475 1 91 1204 1139.003747 2 95 994 1063.320081 3 104 789 824.510582
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 8329.064088695868 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.6904326591702117 Current country being analyzed is Finland and the current independent variable is gdp_for_year ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 1165 1106.867465 1 91 1204 1102.354892 2 95 994 1010.909636 3 104 789 927.993528
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 10212.643621013736 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.6204254289638276 Current country being analyzed is Finland and the current independent variable is gdp_per_capita ($) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 1165 1092.867164 1 91 1204 1089.067913 2 95 994 1004.658467 3 104 789 938.413656
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 7816.6685189323625 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- 0.7094769278053794 Current country being analyzed is Finland and the current independent variable is Anxiety Prevalence (%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 1165 1101.086325 1 91 1204 1101.086325 2 95 994 1078.250412 3 104 789 886.428742
Mean Squared Error ------------------------------------------------------------------------------------------------------------------------------------------------------- 35469.26202751885 R-squared ------------------------------------------------------------------------------------------------------------------------------------------------------- -0.3182903877467005 Current country being analyzed is Finland and the current independent variable is Unemployment Rate(%) ------------------------------------------------------------------------------------------------------------------------------------------------------- index Actual Predicted 0 90 1165 942.768684 1 91 1204 956.616986 2 95 994 971.867632 3 104 789 964.505251
In [ ]:
usa = df.loc[df['country'] == 'United States']
y = usa['suicides_no']
x = usa['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.5229669784472624 Spearmanr: 0.5537561233777729 Kendall: 0.3025316904537665
In [ ]:
x = usa[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: 0.9694943115190687 Spearmanr: 0.988235294117647 Kendall: 0.9500000000000001
In [ ]:
x = usa['population']
psk(x,y)
visual(x,y)
Pearson: 0.9856287427902889 Spearmanr: 0.9970588235294118 Kendall: 0.9833333333333333
In [ ]:
x = usa['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: -0.9728282957877852 Spearmanr: -0.9970588235294118 Kendall: -0.9833333333333333
In [ ]:
UK = df.loc[df['country'] == 'United Kingdom']
y = UK['suicides_no']
x = UK['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.3784882272862713 Spearmanr: 0.6294117647058822 Kendall: 0.38333333333333336
In [ ]:
x = UK[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: 0.3911777286517507 Spearmanr: 0.31176470588235294 Kendall: 0.25
In [ ]:
x = UK['population']
psk(x,y)
visual(x,y)
Pearson: 0.8574078006265712 Spearmanr: 0.7411764705882353 Kendall: 0.5666666666666667
In [ ]:
x = UK['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: 0.6762483923632745 Spearmanr: 0.7016317042084812 Kendall: 0.4852162225324867
In [ ]:
Austria = df.loc[df['country'] == 'Austria']
y = Austria['suicides_no']
x = Austria['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: -0.5903485553040041 Spearmanr: -0.3873347352349848 Kendall: -0.235302425908485
In [ ]:
x = Austria[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: -0.9296965158168207 Spearmanr: -0.6764705882352942 Kendall: -0.5166666666666667
In [ ]:
x = Austria['population']
psk(x,y)
visual(x,y)
Pearson: -0.8643322853027127 Spearmanr: -0.7794117647058825 Kendall: -0.6333333333333333
In [ ]:
x = Austria['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: -0.8478652313303219 Spearmanr: -0.7770421529709435 Kendall: -0.6276205565667923
In [ ]:
Japan= df.loc[df['country'] == 'Japan']
y = Japan['suicides_no']
x = Japan['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.6664818726262071 Spearmanr: 0.39882277169531377 Kendall: 0.32636268941473195
In [ ]:
x = Japan[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: -0.1445703007592053 Spearmanr: -0.1647058823529412 Kendall: -0.13333333333333333
In [ ]:
x = Japan['population']
psk(x,y)
visual(x,y)
Pearson: 0.06208728311481601 Spearmanr: 0.06176470588235294 Kendall: 0.03333333333333333
In [ ]:
x = Japan['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: 0.8257599841197141 Spearmanr: 0.6879398467484049 Kendall: 0.5753670112546582
In [ ]:
Finland= df.loc[df['country'] == 'Finland']
y = Finland['suicides_no']
x = Finland['Unemployment Rate(%)']
psk(x,y)
visual(x,y)
Pearson: 0.09726113184467595 Spearmanr: 0.3294117647058824 Kendall: 0.25
In [ ]:
x = Finland[' gdp_for_year ($) ']
psk(x,y)
visual(x,y)
Pearson: -0.7187050203184171 Spearmanr: -0.7382352941176471 Kendall: -0.5833333333333334
In [ ]:
x = Finland['population']
psk(x,y)
visual(x,y)
Pearson: -0.9497484372918075 Spearmanr: -0.9529411764705882 Kendall: -0.8499999999999999
In [ ]:
x = Finland['Anxiety Prevalence (%)']
psk(x,y)
visual(x,y)
Pearson: -0.8413197138786257 Spearmanr: -0.9565240766177093 Kendall: -0.8692679567468138